# Data In:  celebrity_perspectives_data.dta
#           top_words.csv

library(haven)
library(tidyverse)
library(tidytext)
library(lubridate)

# Load the Stata data set used by every figure below
# (presumably one row per tweet -- confirm against the codebook).
t_df <- read_dta("celebrity_perspectives_data.dta")

# Monthly candidate-mention counts per celebrity (input to Figure 1).
# Retweets and accounts whose `endorsed` value is "none" or "unclear" are
# dropped; the mentionsTrump / mentionsHillary / mentionsBernie columns are
# then summed within each screen_name x endorsed x month cell.
trump_dist_time <- t_df %>%
  filter(is_retweet == 0,
         endorsed != "none",
         endorsed != "unclear") %>%
  # NOTE(review): round_date() snaps to the NEAREST month start, so rows
  # dated in the second half of a month are counted in the following month.
  # floor_date() is the usual choice for calendar-month bins; kept as-is so
  # the existing figures are reproduced -- confirm intent.
  mutate(month = round_date(created_at, "month")) %>%
  # NOTE(review): the bounds are character strings that R coerces to the
  # class of `month`. If `created_at` is a date-time, they are parsed in the
  # session's time zone, so which boundary months survive may depend on
  # where the script runs -- verify against the Figure 1 subtitle.
  filter(month >= "2016-01-01" & month <= "2017-01-01") %>%
  group_by(screen_name, endorsed, month) %>%
  summarize(tweets = n(),
            t_mention = sum(mentionsTrump),
            h_mention = sum(mentionsHillary),
            b_mention = sum(mentionsBernie)) %>%
  # summarize() only peels off the last grouping variable; drop the rest so
  # the result is a plain tibble and later verbs are not silently grouped.
  ungroup() %>%
  # Total mentions of any of the three candidates in that celebrity-month.
  mutate(all_pols = t_mention + h_mention + b_mention)


##################
# FIGURE 1
##################
# Figure 1: jittered scatter of `all_pols` against `month`, one point per
# row of trump_dist_time, written to Figure1.pdf (8.5 x 5 in).
pdf("Figure1.pdf", 8.5, 5, useDingbats = FALSE)
# print() explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when the script is run through
# source(), which would leave an empty PDF.
print(
  ggplot(trump_dist_time, aes(y = all_pols, x = month)) +
    geom_jitter(size = .9) +
    labs(x = "", y = "Number of times a candidate was mentioned",
         title = "Monthly Mentions of Any Candidate – by Each Celebrity",
         subtitle = "(February 2016  - January 2017)")
)
# + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
dev.off()

##################
# FIGURE 2
##################
# Figure 2: monthly share of (non-retweet) tweets that mention Trump, one
# line per `endorsed` group, written to Figure2.pdf (8.5 x 5 in).
pdf("Figure2.pdf", 8.5, 5, useDingbats = FALSE)
# print() explicitly so the plot is drawn even when the script is run via
# source(), where top-level ggplot objects are not auto-printed.
print(
  t_df %>%
    filter(is_retweet == 0,
           endorsed != "none",
           endorsed != "unclear") %>%
    # NOTE(review): round_date() assigns rows to the NEAREST month start,
    # not the calendar month they fall in -- confirm this is intended.
    group_by(month = round_date(created_at, "month"),
             endorsed) %>%
    summarize(tweets = n(),
              t_mention = sum(mentionsTrump),
              percent = t_mention / tweets) %>%
    filter(year(month) >= 2016) %>%
    ungroup() %>%
    ggplot(aes(as.Date(month), percent,
               color = endorsed, linetype = endorsed)) +
    geom_line(size = 1.2) +
    geom_point(aes(shape = factor(endorsed)), size = 2.5) +
    # Dashed red reference line at 2016-11-08; the date is converted to its
    # underlying day count so it can be used as a numeric x intercept.
    # NOTE(review): the title below says "took office" -- confirm the date.
    geom_vline(xintercept = as.integer(as.Date("2016-11-08")),
               color = "red", lty = 2) +
    # `scales` is never attached by this script, so the formatter must be
    # namespace-qualified; a bare `percent` is not defined at top level.
    scale_y_continuous(labels = scales::percent) +
    labs(x = "Month",
         y = "% of tweets that mention Trump",
         title = "Proportions of tweets mentioning \"Trump\" before and after he took office",
         subtitle = "Statistics calculated monthly") +
    theme_bw() + theme(legend.position = "none") +
    # The legend is hidden; groups are labelled directly on the panel.
    annotate("text", label = "Sanders-endorsing\ncelebrities",
             x = as.Date("2017-12-01"), y = .16, size = 5,
             colour = "darkgreen") +
    annotate("text", label = "Clinton-\nendorsing\ncelebrities",
             x = as.Date("2018-04-01"), y = .062, size = 5,
             colour = "darkred") +
    annotate("text", label = "Trump-endorsing\ncelebrities",
             x = as.Date("2016-05-01"), y = .105, size = 5,
             colour = "darkblue")
)
dev.off()


##################
# APPENDIX FIGURES
##################

# Figure A1: monthly share of all non-retweet tweets that mention Trump,
# pooled over every account in t_df, written to FigureA1.pdf (8.5 x 5 in).
pdf("FigureA1.pdf", 8.5, 5, useDingbats = FALSE)
# print() explicitly so the plot is drawn even when the script is run via
# source(), where top-level ggplot objects are not auto-printed.
print(
  t_df %>%
    filter(is_retweet == 0) %>%
    # NOTE(review): round_date() assigns rows to the NEAREST month start,
    # not the calendar month they fall in -- confirm this is intended.
    group_by(month = round_date(created_at, "month")) %>%
    summarize(tweets = n(),
              t_mention = sum(mentionsTrump),
              percent = t_mention / tweets) %>%
    filter(year(month) >= 2016) %>%
    ungroup() %>%
    ggplot(aes(as.Date(month), percent)) +
    geom_line() +
    geom_point() +
    # Dashed red reference line at 2016-11-08 (date as numeric day count).
    geom_vline(xintercept = as.integer(as.Date("2016-11-08")),
               color = "red", lty = 2) +
    # `scales` is never attached by this script, so the formatter must be
    # namespace-qualified; a bare `percent` is not defined at top level.
    scale_y_continuous(labels = scales::percent) +
    labs(x = "Time",
         y = "% of tweets that mention Trump",
         subtitle = "Monthly average for celebrities in the corpus",
         title = "Mentions of Trump")
)
dev.off()


# To show the most common words, we load a summary file
# because we are not allowed to publish the text of all tweets.
# The commented-out code below would produce the most common words
# if the text of all tweets were loaded into memory.

# Prepare a tidy (word-level) dataset:

# library(tidytext)
# t_words_withRTs <- t_df %>%
  #   select(status_id,
  #          screen_name,
  #          text,
  #          created_at,
  #          retweet_count,
  #          mentions_screen_name,
  #          endorsed,
  #          mentionsTrump,
  #          mentionsBernie,
  #          mentionsHillary) %>%
  #   unnest_tokens(word, text)

#   1. Remove stop words 
#   2. Remove most URLs

# my_stop_words <- tibble(
  #   word = c("https","http","t.co","amp","rt","2")
  #  ,lexicon = "twitter"
  #  )
  
#  all_stop_words <- stop_words %>%
    #   bind_rows(my_stop_words)
    
#    t_w_cleaned <- t_words_withRTs %>%
    #   anti_join(all_stop_words, by = "word")
 
#    Identify the most frequently occurring words:
    
#    top_words <- t_w_cleaned %>%
    #   group_by(word) %>%
    #   tally %>%
    #   arrange(desc(n)) %>%
    #   head(15)
    
#   Plot the most common words:

# Figure A2: dot plot of the pre-computed word counts in top_words.csv
# (columns `word` and `n`), ordered by frequency, written to FigureA2.pdf.
top_words <- read.csv("top_words.csv")

pdf("FigureA2.pdf", 8.5, 5, useDingbats = FALSE)
# print() explicitly so the plot is drawn even when the script is run via
# source(), where top-level ggplot objects are not auto-printed.
print(
  # fct_reorder() orders the words by their count; coord_flip() then puts
  # the words on the vertical axis.
  ggplot(top_words, aes(x = fct_reorder(word, n), y = n)) +
    geom_point() +
    coord_flip() +
    labs(x = "", y = "Number of mentions",
         title = "Words in the celebrity corpus ordered by frequency")
)
dev.off()


